*******************************************************************
*************** CLEANING THE DLHS-4 CAB DATASET *******************
*******************************************************************

{ /*** DO-FILE DESCRIPTION ***/

/* 
This do-file prepares the DLHS-4 CAB dataset for our data analysis.
New variables needed for the asset index and the analysis itself are generated, 
implausible values are set to missing, and households are identified as well as 
their composition with respect to sex and age. Furthermore, child and infant 
mortality rates are calculated at the household level. Finally, a sampling weight
that adjusts for state size is created. 
*/

/*
DATA SETS USED:
DLHS-4 cab
DLHS-4 HH
DLHS-4 EW
adultpopulationbystate
*/

/*
DATA SETS CREATED:
final:
DLHS_cleaned // basic cleaning was done
HH_mort // contains number of deceased infants and children per HH
EW_mort // contains number of deceased household members born to any of the women
			in the HH
DLHS_relativepopulation_census // contains state wise relative population according to Census2011
relativepopulation_cab // contains state wise relative population according to DLHS CAB data

DLHS_mort // final data set that will be used for analysis

temporary:
DLHS_temp
*/

/*
VARIABLES CREATED:
adulthh							height
age1							hhid
age2							hhsize
asset variables					hypertension cab variables
BMI variables					hypertension diagnosed	
childhh							malehh
diabetes cab variables			married
diabetes diagnosed				pregnant
DLHS							religion
educ							
educ_cont						rural
female							state_dist
femalehh						weight
glucose							
*/

/* 
GLOBALS USED:
rawdlhs: directory that contains the DLHS raw data
temp: directory where temporary data sets are stored
final: directory where final data sets are stored
pop: contains population data used for sampling weights
*/

}

* pin the Stata version the code was written for
version 13

* directory globals; every path ends with a backslash
global rawdlhs "C:\Users\Michaela\Desktop\rawdata\DLHS-4\"
global temp "C:\Users\Michaela\Desktop\manipulated data\temp\"
global final "C:\Users\Michaela\Desktop\manipulated data\final\"
global pop "C:\Users\Michaela\Desktop\rawdata\population data\"

* load the raw CAB data, discarding whatever is currently in memory
use "$rawdlhs\DLHS-4 cab", clear

{ /*** DROPPING DUPLICATES ***/
/*
Reduces the CAB file to one record per prim_key:
	1. keep usual residents only
	2. drop records that are identical on every variable
	3. drop records without an identifier or without rcvid
	4. for identifiers that still occur more than once, blank the haemoglobin
		reading (hv90a) and keep a single record
*/

* only keep usual residents	
	keep if hv06 == 1

* remove records that are identical on all variables
	duplicates drop
	*isid primekeynew // no
	* misstable sum if primekeynew == "" // shows that there are only entries for state, district, hv90, hv90a
	drop if primekeynew == ""
	
	drop if rcvid == . // these observations only contained missings
	
* tag remaining duplicates; each tag holds the number of OTHER records that
* share the key (0 = unique, 1 = one extra copy, 2 = two extra copies, ...)
	duplicates tag primekeynew, gen(duplnew)
	duplicates tag prim_key, gen(duplkey)
	duplicates tag prim_key primekeynew, gen(duplboth)
	
/* now the same observations have duplicates in both prims (3692 observations)
	the duplicates we still have are all caused by different values in hv90a (haemoglobin level)
	as this variable will be relevant for one of our analysis, we will set the 
		hemoglobin values to missing for the duplicate observations and then
		drop the duplicates
*/
* "> 0" instead of "== 1": a key that occurs three or more times is tagged
* with 2 or more and must have its conflicting haemoglobin value blanked too,
* otherwise the forced drop below keeps an arbitrary reading
	replace hv90a = . if duplboth > 0
	duplicates drop prim_key, force
	drop duplnew duplkey duplboth
	
* informational check only: the count is displayed, not enforced
	count if prim_key != primekeynew // both are the same for all observations
	drop primekeynew

}

{ /*** GENERATING AND CLEANING VARIABLES - Household & individual characteristics ***/
/*
Derives analysis variables from the raw hv* items. Unless stated otherwise,
raw values that match none of the listed codes end up as missing in the new
variable.
*/

* unique district identifier: two-digit state code followed by two-digit district code
	gen state_dist = string(state, "%02.0f") + string(dist, "%02.0f")
	order state dist state_dist
	
* gender
* the category "other" is set to missing
	gen female = 1 if hv05 == 2
	replace female = 0 if hv05 == 1
		
* categorical education variable (codes outside 1-10 and 99 stay missing)
	gen educ = 1 if hv12 <= 2 | hv12 == 99
	replace educ = 2 if hv12 == 3
	replace educ = 3 if hv12 == 4
	replace educ = 4 if hv12 == 5 
	replace educ = 5 if hv12 == 6
	replace educ = 6 if hv12 <= 10 & hv12 >= 7
	
* continuous education variable; values above 25 are treated as implausible
	gen educ_cont = hv13 if hv13 <= 25
	
* religion dummies
* codes 7 and 8 are pooled into 96 before the dummies are created
	gen religion = hv30
	replace religion = 96 if hv30 == 7 | hv30 == 8
	
* tab creates one dummy per observed category; the loop and the rename below
* rely on exactly eight categories being present in the data
	tab religion, gen(rel)
	
	foreach var of varlist rel1 - rel8 {
		lab var `var' "hh head belongs to respective religion"
		lab val `var' yesno
	}
	
	#delimit;
	rename (rel1 rel2 rel3 rel4 rel5 rel6 rel7 rel8) 
		(hindu muslim christian sikh buddhist jain noreligion otherreligion);
	#delimit cr
	 drop religion

* caste: hv31b code 6 and hv31a code 3 are both mapped to category 3
	gen caste = hv31b
	replace caste = 3 if caste == 6
	replace caste = 3 if hv31a == 3


*** generate two age variables for each age variable available ***

* Q08 age in completed years
	gen age1 = hv08
	replace age1 = . if age1 > 110
	
* Q79 age in days, months and years (CAB part)
/* 
hv79y: values from 1 upwards
hv79m: values from 1 to 11
hv79d: values from 1 to 29
*/
	gen age2 = hv79y
* A child younger than one year has no entry in hv79y. missing(a, b) is true
* as soon as ANY argument is missing, so !missing(hv79m, hv79d) demanded that
* months AND days are both recorded. Since neither item contains zeros, an age
* given in months only (or days only) would have stayed missing. Age 0 is
* therefore assigned when at least one of the two is recorded.
	replace age2 = 0 if (!missing(hv79m) | !missing(hv79d)) & missing(hv79y)
	replace age2 = . if age2 > 110	
	
* marital status: codes 2, 3 and 4 count as married; code 8 and missing stay missing
	gen married = inlist(hv09, 2,3,4)
	replace married = . if hv09 == 8 | hv09 == .
	
* rural or urban location of household
	gen rural = 1 if htype == 1
	replace rural = 0 if htype == 2
	
* diagnosis of diabetes and hypertension
/* 
missings in hv23 are coded as 0 in the new variables because
they are not only caused by non-response but also by a skip pattern
(no chronic illness during the last year)
*/
	gen diab_diagnosed = 1 if hv23 == 1
	replace diab_diagnosed = 0 if hv23 != 1
		
	gen htn_diagnosed = 1 if hv23 == 2
	replace htn_diagnosed = 0 if hv23 != 2
		
* treatment of diabetes and hypertension
/*
missings in hv25 are coded as 0 in the new variables as they can be caused 
by non-response as well as the skip pattern
*/
	gen diab_treated = (diab_diagnosed == 1 & hv25 == 2)
	
	gen htn_treated = (htn_diagnosed == 1 & hv25 == 2)
		
* being fasted on survey day
	gen fasted = 1 if hv91 == 2
	replace fasted = 0 if hv91 == 1

* pregnancy status
/* 
generate a variable for an individual being pregnant or lactating and pregnant.
Males are originally coded as missing (not as not-pregnant). 
*/	
	gen pregnant = (hv81 == 1 | hv81 == 3) if hv81 != .
	replace pregnant = 0 if female == 0
	replace pregnant = 0 if age1 < 10
	replace pregnant = 0 if hv80 == 2
	
* height: hv85 divided by 100; values outside 0.6-2.5 are set to missing
	gen height = hv85/100
	replace height = . if height < 0.6 | height > 2.5

* weight: hv82 plus hv82a/1000; if only one part is recorded it is used alone
	gen weight = hv82 + (hv82a/1000) if hv82 != . & hv82a != .
	replace weight = hv82 if hv82a == .
	replace weight = hv82a/1000 if hv82 == .
	replace weight = . if weight < 30 | weight > 160

* BMI
	gen BMI = weight/height^2 if weight != . & height != .
	replace BMI = . if BMI < 12 | BMI > 80
	
* BMI categories; each is missing whenever BMI is missing
	gen overweight = (BMI > 25 & BMI <=30)
	replace overweight = . if BMI == .
	
	gen obese = (BMI > 30 & BMI != .)
	replace obese = . if BMI == .
	
	gen BMIgrt25 = (BMI > 25 & BMI != .)
	replace BMIgrt25 = . if BMI == .
	
/* gen var indicating that these observations are from the DLHS, when we later
	append DLHS and AHS */
	gen DLHS = 1
}
	
{ /*** HOUSEHOLD ID AND COMPOSITION ***/
/*
Builds the household identifier from the first 13 characters of prim_key and
counts, per household, all members, children, adults, females and males.
*/

*** Household ID ***
	gen hhid = substr(prim_key, 1,13)
	destring hhid, replace
	format hhid %15.0g
	
* number of HH members
	sort hhid
	by hhid: egen hhsize = count(hhid)
	
* number of adults and children per HH
* child is 0 when age1 is missing, adult is missing in that case
	gen child = (age1 < 18)
	gen adult = (age1 > 17) if age1 != .
	
	by hhid: egen childhh = total(child)
	by hhid: egen adulthh = total(adult)
	
/* adulthh and childhh might be used for equivalence scales. 
	It is not decided how HH members with missing age1 are treated. In order 
	to avoid mistakes, the numbers of children and adults are currently set to missing for households
	where at least one HH member has a missing value in age1.
*/
	by hhid: egen missage = max(missing(age1))
	replace childhh = . if missage == 1
	replace adulthh = . if missage == 1
	drop missage
	
* generate variable for number of males and females per HH 
/* femalehh and malehh are set to missing for the households that have 
	at least one member with a missing value in "female". This avoids mistakes
	if we calculate a sex ratio on the hh level at a later point in time.
*/
	by hhid: egen femalehh = total(female)
	
	gen male = (female == 0)
	by hhid: egen malehh = total(male)

	by hhid: egen misssex = max(missing(female))
	replace malehh = . if misssex == 1
* the household count femalehh is blanked here, not the individual-level
* variable female: overwriting female would erase the recorded sex of every
* member of such a household
	replace femalehh = . if misssex == 1
	
	drop misssex child adult male 
}	
	

{ /*** GENERATING VARIABLES - GLUCOSE, BLOOD PRESSURE AND HEMOGLOBIN MEASUREMENTS ***/
/***************************************************************************
For the following variables, missings in the variables from the raw data set
used to generate new variables are kept as missings.
****************************************************************************/	
	
* generate variables regarding diabetes
* readings outside 40-600 are treated as implausible and left missing
	gen glucose = hv91a if hv91a >= 40 & hv91a <= 600
* converting capillary to venous blood glucose level
	replace glucose = glucose*1.11
	
	gen glucgrt200 = 1 if glucose >= 200 & glucose != .
	replace glucgrt200 = 0 if glucose < 200
	
* threshold 126 applies when fasted == 1, threshold 200 otherwise
* (a missing fasting status is handled like "not fasted")
	gen diab_narrow = 1 if ((fasted == 1 & glucose >= 126) | (fasted !=1 & glucose >= 200)) & glucose != .
	replace diab_narrow = 0 if (fasted == 1 & glucose < 126) | (fasted !=1 & glucose < 200)
	
* NOTE(review): diab_treated is generated as a 0/1 expression and has no
* missings, so the condition below never applies; please confirm the intended
* treatment of observations without a glucose reading
	gen diab_broad = (diab_narrow == 1 | diab_treated == 1)
	replace diab_broad = . if diab_narrow == . & diab_treated == .
	
	
* generate variables regarding hypertension
* systolic readings are kept within 70-240, diastolic readings within 30-120
	gen bpsyst_frst = hv93a if hv93a >= 70 & hv93a <= 240
	gen bpsyst_sec = hv93b if hv93b >= 70 & hv93b <= 240
	
* average of both readings; if only one is available it is used on its own
	gen bpsyst_avg = (bpsyst_frst + bpsyst_sec) / 2 if bpsyst_frst != . & bpsyst_sec != .
	replace bpsyst_avg = bpsyst_frst if missing(bpsyst_sec)
	replace bpsyst_avg = bpsyst_sec if missing(bpsyst_frst)
	
	gen bpdiast_frst = hv94a if hv94a >= 30 & hv94a <= 120
	gen bpdiast_sec = hv94b if hv94b >= 30 & hv94b <= 120
	
	gen bpdiast_avg = (bpdiast_frst + bpdiast_sec) / 2 if bpdiast_frst != . & bpdiast_sec != .
	replace bpdiast_avg = bpdiast_frst if missing(bpdiast_sec)
	replace bpdiast_avg = bpdiast_sec if missing(bpdiast_frst)
	
* hypertension: 1 if either value reaches 140 / 90; 0 only if both values are
* observed and below their threshold
	gen htn_narrow_avg = 1 if (bpsyst_avg >= 140 & !missing(bpsyst_avg)) | (bpdiast_avg >= 90 & !missing(bpdiast_avg))
	replace htn_narrow_avg = 0 if bpsyst_avg < 140 & bpdiast_avg < 90
	
	gen htn_broad_avg = (htn_narrow_avg == 1 | htn_treated == 1)
	replace htn_broad_avg = . if htn_narrow_avg == . & htn_treated == .
	
	gen htn_narrow_sec = 1 if (bpsyst_sec >= 140 & !missing(bpsyst_sec)) | (bpdiast_sec >= 90 & !missing(bpdiast_sec))
	replace htn_narrow_sec  = 0 if bpsyst_sec < 140 & bpdiast_sec < 90	
	
	gen htn_broad_sec = (htn_narrow_sec == 1 | htn_treated == 1)
	replace htn_broad_sec = . if htn_narrow_sec == . & htn_treated == .
	
* stage 2 (160 / 100) and urgency (180 / 110); missing only if both the
* systolic and the diastolic value are missing
	gen htn_stage2_avg = (bpsyst_avg >= 160 & !missing(bpsyst_avg)) | (bpdiast_avg >= 100 & !missing(bpdiast_avg))
	replace htn_stage2_avg = . if bpsyst_avg == . & bpdiast_avg == .	
	
	gen htn_stage2_sec = (bpsyst_sec >= 160 & !missing(bpsyst_sec)) | (bpdiast_sec >= 100 & !missing(bpdiast_sec))
	replace htn_stage2_sec = . if bpsyst_sec == . & bpdiast_sec == .
		
	gen htnurgency_avg = (bpsyst_avg >= 180 & !missing(bpsyst_avg)) | (bpdiast_avg >= 110 & !missing(bpdiast_avg))
	replace htnurgency_avg = . if bpsyst_avg == . & bpdiast_avg == .
		
	gen htnurgency_sec = (bpsyst_sec >= 180 & !missing(bpsyst_sec)) | (bpdiast_sec >= 110 & !missing(bpdiast_sec))
	replace htnurgency_sec = . if bpsyst_sec == . & bpdiast_sec == .
	
* generating variables regarding the hemoglobin level / anemia
/* There are 31.9% missing values in hv90a 
	Together with implausible values we will have 32.47% of missing values
*/	
	gen hemoglobin = hv90a
	replace hemoglobin = . if hemoglobin < 5 | hemoglobin > 22
	
* anemia variables
/* The cut-offs are defined according to the WHO http://www.who.int/vmnis/indicators/haemoglobin/en/
	The values will be set missing for individuals <18y and pregnant women.
			However, for completeness (and to be able to check plausibility), 
			the code will be kept here.
*/
	gen anaemia = 0
	replace anaemia = 1 if hemoglobin < 11 & age1 < 5 
	replace anaemia = 1 if hemoglobin < 11.5 & age1 >= 5 & age1 < 12
	replace anaemia = 1 if hemoglobin < 12 & age1 >= 12 & age1 < 15
	replace anaemia = 1 if hemoglobin < 12 & female == 1 & pregnant == 0 & age1 >= 15 & age1 != .
	replace anaemia = 1 if hemoglobin < 11 & female == 1 & pregnant != 0 & age1 >= 15 & age1 != .
	replace anaemia = 1 if hemoglobin < 13 & female == 0 & age1 >= 15 & age1 != .
* without a haemoglobin value, an age, or (from age 15) a sex, none of the
* cut-offs above can be applied; such observations must be missing rather
* than "not anaemic" (same treatment as for sevanaemia below)
	replace anaemia = . if hemoglobin == . | age1 == .
	replace anaemia = . if age1 >= 15 & female == .

	
* severe anaemia
	gen sevanaemia = 0
	replace sevanaemia = 1 if hemoglobin < 7 & age1 < 5 
	replace sevanaemia = 1 if hemoglobin < 8 & age1 >= 5 & age1 < 15
	replace sevanaemia = 1 if hemoglobin < 8 & female == 1 & pregnant == 0 & age1 >= 15 & age1 != .
	replace sevanaemia = 1 if hemoglobin < 7 & female == 1 & pregnant != 0 & age1 >= 15 & age1 != .
	replace sevanaemia = 1 if hemoglobin < 8 & female == 0 & age1 >= 15 & age1 != .
	replace sevanaemia = . if hemoglobin == . | age1 == .
	replace sevanaemia = . if age1 >= 15 & female == .
		
* Restrict sample to relevant observations
/* 
Set values for these variables missing for pregnant women as well as individuals
younger than 18 years.
*/

* generate a variable for individuals who are under 18 based on the age1 variable
* (a missing age1 yields dropage = 0)
	gen dropage = (age1 < 18)
	

* set values to missing for individuals that will not be part of our analysis
* bpsyst_sec and bpdiast_sec are part of the list so that the raw second
* readings are treated like the first readings and like the indicators
* derived from them
	#delimit;
	foreach var of varlist glucose glucgrt200 diab_narrow diab_broad bpsyst_frst 
			bpsyst_sec bpsyst_avg bpdiast_frst bpdiast_sec bpdiast_avg 
			htn_narrow_avg htn_broad_avg htn_narrow_sec htn_broad_sec 
			htn_stage2_avg htn_stage2_sec htnurgency_avg htnurgency_sec 
			height weight BMI overweight obese BMIgrt25 { ;
	#delimit cr
		 
		replace `var' = . if pregnant == 1 | dropage == 1
	}
}
{ /*** GENERATING VARIABLES - HOUSEHOLD ASSETS ***/
/*
Creates 0/1 indicators for the asset index. Each indicator is 1 when the raw
item carries one of the listed codes and 0 in every other case, including a
missing raw item.
*/

* share of missing values per raw item
	/* none of the assets entering the index should have more than 3% of 
		missing values */	
	mdesc hv32 hv35 hv39 hv40 hv44 hv45 hv46a hv46b hv46c hv46e hv46g hv46h hv46i hv46k hv46l hv46m hv47
* The highest share of missings is in hv47 (ownership) with 0.11%


* Improved source of drinking water (WHO definition of improved water supply)
	gen impwater = inlist(hv32, 1, 2, 3, 4, 5) | inlist(hv32, 7, 9)

* Sanitation facility
/* 
	Improved sanitation follows the WHO definition.
	"Flush don't know where" (0.29%) counts as improved for the WHO, because the
		respondent may simply not know whether the water goes to a septic tank or
		a piped sewer system. "Flush to somewhere" (0.96%) is not specifically
		defined by the WHO; it is classified as "not improved" here since the waste
		might be deposited right next to the homestead.
	Only non-shared facilities are considered "improved".
		"Other toilet facility" and "field/open space" should never be flagged as
		"not shared", yet this happens in 58 and 1929 observations respectively.
		Those cases are cleaned by forcing notshared to 0.
*/
	gen notshared = hv36 == 3 & !inlist(hv35, 51, 96)

	lab val notshared yesno

	gen impsani = (hv35 <= 13 | inlist(hv35, 15, 21, 22, 31)) & notshared == 1

* cooking fuel
/*
	The WHO and IEA define modern fuels to be LPG, Electricity and Biogas.
*/
	gen fuel = (hv39 == 6 | hv39 == 7 | hv39 == 8)


* type of house
/* No official categorisation of house types exists. A conservative definition
	is used: only Puccha counts as improved.
*/
	gen house = hv40 == 1


* main source of lighting
* Electricity and solar are considered improved because they are "clean"
	gen light = (hv44 == 1 | hv44 == 3)

* house ownership status
	gen owner = hv45 == 1

* Assets
* computer and phone each combine two raw items
	gen radio = hv46a == 1
	gen tv = hv46b == 1
	gen comp = inlist(1, hv46c, hv46d)
	gen phone = inlist(1, hv46e, hv46f)

* remaining assets map one-to-one onto a raw item hv46<letter>
	local assetnames wash refri sewing bike scooter car
	local assetitems g h i k l m
	forvalues j = 1/6 {
		local newvar : word `j' of `assetnames'
		local letter : word `j' of `assetitems'
		gen `newvar' = (hv46`letter' == 1)
	}


* any land owned
	gen land = hv47 == 1
}


{ /*** LABELLING VARIABLES AND VALUES ***/
/*
Defines the value labels, attaches them, labels all generated variables and
saves the cleaned CAB data as DLHS_cleaned in the $final directory.
*/

* value label definitions and their assignment
	#delimit ; 
	lab def state2 2 "Himachal Pradesh" 3 "Punjab" 4 "Chandigarh" 6 "Haryana"
			7 "NCT of Delhi" 11 "Sikkim" 12 "Arunachal Pradesh" 13 "Nagaland" 14 "Manipur"
			15 "Mizoram" 16 "Tripura" 17 "Meghalaya" 19 "West Bengal" 25 "Daman and Diu"
			27 "Maharashtra" 28 "Andhra Pradesh" 29 "Karnataka" 30 "Goa" 32 "Kerala"
			33 "Tamil Nadu" 34 "Puducherry" 35 "Andaman and Nicobar Islands" 36 "Telangana" ;
	#delimit cr
	lab value state state2
	lab def female1 0 "male" 1 "female"
	lab val female female1
	lab def educ1 1 "<Primary or illiterate" 2 "Primary" 3 "Middle" 4 "Secondary" 5 "High School" 6 ">High School"
	lab val educ educ1
	lab def rural1 0 "urban" 1 "rural"
	lab val rural rural1
	lab def data1 0 "AHS" 1 "DLHS"
	lab val DLHS data1
	lab def caste2 1 "SC" 2 "ST" 3 "Other/None" 
	lab val caste caste2
* yesno is already attached by name to some variables earlier in the file;
* its contents are defined here
	lab def yesno 0 "no" 1 "yes"
	
* attach yesno to all remaining 0/1 indicators
	#delimit ;
	foreach v in married diab_treated htn_treated fasted htn_diagnosed
		diab_diagnosed glucgrt200 diab_narrow diab_broad htn_narrow_avg 
		htn_broad_avg htn_narrow_sec htn_broad_sec htn_stage2_avg htn_stage2_sec 
		htnurgency_avg htnurgency_sec pregnant dropage BMIgrt25 obese overweight 
		impwater impsani fuel house light owner radio tv comp phone wash refri
		sewing bike scooter car land anaemia sevanaemia{ ;
	#delimit cr
		lab val `v' yesno
	}
		
* variable labels
	lab var prim_key "unique identifier"
	lab var female "gender"
	lab var educ "educational attainment"
	lab var educ_cont "years of schooling"
	lab var age1 "age based on completed years Q08"
	lab var age2 "age in years based on exact age Q79"
	lab var married "married at time of survey"
	lab var rural "location of household"
	lab var diab_treated "regular treatment of diabetes"
	lab var htn_treated "regular treatment of hypertension"
	lab var fasted "being fasted"
	lab var diab_diagnosed "was diabetes diagnosed"
	lab var htn_diagnosed "was hypertension diagnosed"
	lab var glucose "venous blood sugar level cleaned"
	lab var glucgrt200 "venous blood sugar level above 200"
	lab var diab_narrow  "suffers from diabetes according to test result"
	lab var diab_broad "suffers from diabetes according to test result or takes medication"
	lab var bpsyst_frst "first reading systolic cleaned"
	lab var bpsyst_sec "second reading systolic cleaned"
	lab var bpsyst_avg "average of readings systolic"
	lab var bpdiast_frst "first reading diastolic cleaned"
	lab var bpdiast_sec "second reading diastolic cleaned"
	lab var bpdiast_avg "average of readings diastolic"
	lab var htn_narrow_avg "suffers from hypertension, avg. reading"
	lab var htn_broad_avg "suffers from hypertension (avg. reading) or takes medication"
	lab var htn_narrow_sec "suffers from hypertension, second reading"
	lab var htn_broad_sec "suffers from hypertension (second reading) or takes medication"
	lab var htn_stage2_avg "second stage hypertension, average reading"
	lab var htn_stage2_sec "second stage hypertension, second reading"
	lab var htnurgency_avg "hypertensive urgency, average reading"
	lab var htnurgency_sec "hypertensive urgency, second reading"
	lab var pregnant "individual is pregnant"
	lab var height "height in meters"
	lab var weight "weight in kgs"
* dropage is generated as (age1 < 18), which is 0 when age1 is missing; the
* label states exactly that instead of claiming that missing ages are flagged
	lab var dropage "under 18 according to age1 (missing age1 coded as 0)"
	lab var BMI "Body Mass Index"
	lab var overweight "indiv is overweight"
	lab var obese "indiv is obese"
	lab var BMIgrt25 "BMI >25"
	lab var DLHS "data from DLHS or AHS dataset"
	lab var hhid "household identifier"
	lab var hhsize "number of hh members"
	lab var childhh "no. of children per HH"
	lab var adulthh "no. of adults per HH"
	lab var femalehh "no. of female hh members"
	lab var malehh "no. of male hh members"
	lab var notshared "sanitation facility is not shared"
	lab var caste "caste category"
	lab var impwater "HH has access to improved water supply"
	lab var impsani "HH uses improved, unshared sanitation facility"
	lab var fuel "HH uses modern cooking fuel"
	lab var house "HH lives in a Puccha"
	lab var light "HH uses clean source of lighting"
	lab var radio "HH owns at least one radio"
	lab var tv "HH owns at least one TV"
	lab var comp "HH owns at least one computer"
	lab var phone "HH owns at least one mobile/landline phone"
	lab var wash "HH owns at least one washing machine"
	lab var refri "HH owns at least one fridge"
	lab var sewing "HH owns at least one sewing machine"
	lab var bike "HH owns at least one bike"
	lab var scooter "HH owns at least one scooter"
	lab var car "HH owns at least one car"
	lab var land "HH owns any land"
	lab var hemoglobin "hemoglobin level in g/dL"
	lab var anaemia "indiv suffers from anaemia"
	lab var sevanaemia "indiv suffers from severe anaemia"

save "$final\DLHS_cleaned", replace
}



{ /*** MORTALITY DATA FROM HOUSEHOLD DATA SET ***/
/*
Counts, per household, the deceased household members recorded with an age at
death in days or months (infants) or below five years (U5) and stores the
result as HH_mort in the $final directory.
*/
use "$rawdlhs\DLHS-4 HH", clear

	keep state dist psu psu_n prim_key hv60* hv61* hv62* hv55 hv02*

* restrict to households that reported a death since 2008
	keep if hv55 == 1

	destring prim_key, replace
	format prim_key %20.0g

* wide -> long: one row per household and recorded death
	keep hv60* hv61* hv62* hv02* prim_key
	reshape long hv60 hv61 hv62, i(prim_key) j(hv02, string)
	drop hv02*

	rename (hv60 hv61 hv62) (days months years)
* discard rows that carry no age at death at all
	keep if years != . | days != . | months != .

* death flags per row
* an age at death given in days or months marks an infant death
	gen infdeath = !(days == . & months == .)
	gen u5death = (years < 5 | infdeath == 1)

* household totals
	bysort prim_key: egen total_infdeath = total(infdeath)
	bysort prim_key: egen total_u5death = total(u5death)

* collapse to a single row per household
	bysort prim_key: keep if _n == 1

/* household identifier as used in the CAB data:
 prim_key without its last two digits, which always are 0
*/
	gen double hhid = prim_key/100
	format hhid %20.0g
	sort hhid

* marks households that come from the HH data set
	gen MORT = 1


*** LABELLING VARIABLES ***
	lab var infdeath "child died before turning 1y"
	lab var u5death "child died before turning 5y"
	lab var total_infdeath "total number of deceased infants in HH"
	lab var total_u5death "total number of deceased U5 children in HH"
	lab var hhid "household identifier"
	lab var MORT "data obtained from MORT(AHS) / HH(DLHS) data sets"

* row-level helpers are no longer needed
	drop prim_key infdeath u5death years days months


save "$final\HH_mort", replace
}

{ /*** MORTALITY DATA FROM WOMAN DATA SET ***/
/*
Computes, per household, the ratio of deceased children to live births
reported by the women in the EW data set and stores it as EW_mort in the
$final directory.
*/
use "$rawdlhs\DLHS-4 EW", clear

	keep q120 q122 q123_d q123_s q124 q125_d q125_s q126 q127_b q127_g prim_key state dist

	rename (q120 q122 q123_d q123_s q124 q125_d q125_s q126 q127_b q127_g) ///
		(delivered childinhh girlinhh boyinhh childouthh girlouthh boyouthh deceased decboy decgirl)

* household identifier: first 13 characters of prim_key, converted to numeric
	sort prim_key
	gen hhid = substr(prim_key, 1,13)
	destring hhid, replace
	sort hhid

* restrict to women who have ever delivered a child
	keep if delivered == 1

* recode the extended missing value .r to system missing
	foreach v of varlist girlinhh boyinhh girlouthh boyouthh decboy decgirl {
		replace `v' = . if `v' == .r
	}

* per woman: live births and deceased children (rowtotal counts missings as 0)
	egen btot = rowtotal(girlinhh boyinhh girlouthh boyouthh decboy decgirl)
	egen dtot = rowtotal(decboy decgirl)

* per household: live births and deceased children
	bysort hhid: egen total_birth = total(btot)
	bysort hhid: egen total_death = total(dtot)

* households without any recorded live birth are removed
	keep if total_birth != 0

* collapse to a single row per household
	bysort hhid: keep if _n == 1

	gen mort_women = total_death/total_birth

* marks households that come from the EW data set
	gen WOMAN = 1

*** LABELLING VARIABLES ***
	lab var btot "total number of livebirths per woman"
	lab var dtot "total number of deceased children per woman"
	lab var total_birth "total number of live births in HH, WOMAN data"
	lab var total_death "total number of deceased children in HH, WOMAN data"
	lab var mort_women "mortality ratio by HH; total deaths/total births; not age specific"
	lab var WOMAN "data from WOMAN(AHS) / EW(DLHS) data set"

* only hhid, state, dist, mort_women and WOMAN remain in the saved file
	drop total_birth total_death prim_key delivered childinhh girlinhh boyinhh childouthh girlouthh boyouthh deceased decboy decgirl btot dtot

save "$final\EW_mort", replace
}
{ /*** merge the mortality data sets with the CAB data set ***/
/*
Adds the household-level mortality information (HH_mort, EW_mort) to the
cleaned CAB data, derives survey and birth dates, computes infant and U5
mortality rates and saves the result as DLHS_temp in the $temp directory.
*/

use "$final\DLHS_cleaned", clear
	merge m:1 hhid using "$final\HH_mort"
	drop if _merge == 2 // drop observations that only are contained in HH data
	drop _merge
	
/* HH that were not in HH_mort data will have missings in total_infdeath and
	total_u5death. These values will be kept as missings in order to be able
	to distinguish them from the HHs where no child died.
*/
* MORT is 1 for households found in HH_mort and 0 for all others
	replace MORT = 0 if MORT == .
	
	merge m:1 hhid using "$final\EW_mort"
	drop if _merge == 2 // drop observations that only are contained in EW data
	drop _merge
	
* WOMAN is 1 for households found in EW_mort and 0 for all others
	replace WOMAN = 0 if WOMAN == .
	
{ /* generate survey date, birth date and reference period variables */

* reformat the separate survey date variables
/* the time stamp has some implausible values. It might be better to use the
		day, month and year variables.
	Furthermore, these three variables and the time stamp differ for 78% of the
		observations
	sdate has no missings
*/
	gen sdate = mdy(qsinterviewmonth, qsinterviewdate, qsinterviewyear)
	format sdate %td
	lab var sdate "starting date of interview, based on qs variables"

* reformat the time stamp 
/* in case the time stamp is used, then the interview starting time should be 
	used because there we only have 2 missing values (end time has more missings)
*/
* sdate2 is only created for comparison; dates before 01jan2012 are discarded
	gen double stime = clock(tsstart , "YMDhms")
	gen sdate2 = dofc(stime)
	format sdate2 %td
	replace sdate2 = . if sdate2 < td(01jan2012)
	drop stime
	lab var sdate2 "starting date of interview, based on time stamp"
	
	
* start date of reference period
* the string 2008/01/01 is converted to a datetime and then to a daily date
	gen refdate_1 = "2008/01/01"
	gen double refdate_2 = clock(refdate_1, "YMD")
	gen refstart = dofc(refdate_2)
	format refstart %td
	drop refdate*
	lab var refstart "reference period started on 01/01/2008"
	
* calculate length of reference period
	gen refdays = sdate - refstart
	lab var refdays "number of days in reference period"
	
* new birth day, month and year variables
* the codes 98 (day, month) and 9998 (year) are set to missing
	gen bday = hv07d
	replace bday = . if bday == 98
	gen bmonth = hv07m
	replace bmonth = . if bmonth == 98
	gen byear = hv07y
	replace byear = . if byear == 9998

* age at survey date
* bdate is missing whenever any of the three birth components is missing
	gen bdate = mdy(bmonth, bday, byear)
	format bdate %td
	 
	gen age3 = sdate - bdate
	* replace age3 = . if age3 < 0 // three obs have a negative age3 value if sdate2 is used
	lab var age3 "age in days, bases: birth and survey dates"
}

{ /* MORTALITY RATES */
* dummy for infant (age3 below 365 days); missing if age3 is missing
	gen infant = 1 if age3 < 365
	replace infant = 0 if age3 >= 365 & age3 != .
	lab var infant "indiv was < 1y at survey date"
	
* dummy for U5 (age3 below 1825 days); missing if age3 is missing
	gen u5 = 1 if age3 < 1825
	replace u5 = 0 if age3 >= 1825 & age3 != .
	lab var u5 "indiv was <5y at survey date"
	
* days lived in reference period
/* individuals that are infants at the time of survey lived exactly this number
		of days during the reference period.
		
	The same is true for children who are younger than 5
*/
	gen refdays_infant = age3 if infant == 1
	lab var refdays_infant "number of days being <1y lived during reference period"
	
	gen refdays_u5 = age3 if u5 == 1
	lab var refdays_u5 "number of days being <5y lived during reference period"
	
/* as the reference period is longer than 1 year there might be some individuals
		that were born during the reference period but are not an infant anymore at the
		survey date.
	For these individuals, refdays_infant will be set to 365.
	
	For some HH the reference period is also longer than 5 years. Hence, if there
		are children who were born and turned 5 during the reference period, 
		refdays_u5 will be set to 1825 (5*365)
*/
	replace refdays_infant = 365 if bdate >= td(01jan2008) & bdate < sdate & ///
		age3 >= 365 & sdate != .
		
	replace refdays_u5 = 1825 if bdate >= td(01jan2008) & bdate < sdate & ///
		age3 >= 1825 & sdate != .
	
/* there are individuals that were born before the reference period started
	and turned 1 (or 5) during the reference period (i.e. they were born after 
	01.01.2007 or 01.01.2003 respectively). 
	For these observations, refdays_infant (refdays_u5) will take the value 
	of days that the individual was younger than 1 (younger than 5) during the 
	reference period.
*/
* (refstart - bdate) is the number of days lived before the reference period
	replace refdays_infant = 365 - (refstart - bdate) if bdate < td(01jan2008) & bdate >= td(01jan2007) & bdate != .
	
	replace refdays_u5 = 1825 - (refstart - bdate) if bdate < td(01jan2008) & bdate >= td(01jan2003) & bdate != .
	
/* refdays_infant (refdays_u5) will take the value of 0 if an individual was not 
		younger than 1y (5y) at any point in time during the reference period
*/
	replace refdays_infant = 0 if bdate < td(01jan2007)
	
	replace refdays_u5 = 0 if bdate < td(01jan2003)
	
* proportion of number of days being <1y (<5y) during reference period
	gen prop_infant = refdays_infant / 365
	lab var prop_infant "proportion of time being <1y during reference period"
	
	gen prop_u5 = refdays_u5 / 1825
	lab var prop_u5 "proportion of time being <5y during reference period"
	
* sum up proportions per HH
* NOTE(review): the if-condition only excludes rows with a missing proportion
* and age1 below 1 (below 5); those rows get a missing household sum while the
* other rows of the same household get a value, so the sum can differ within
* a household - confirm that this is intended
	bysort hhid: egen hhprop_infant = total(prop_infant) if prop_infant != . | (prop_infant == . & age1 >= 1)
	
	bysort hhid: egen hhprop_u5 = total(prop_u5) if prop_u5 != . | (prop_u5 == . & age1 >= 5)
	
* infant mortality rate
/* total_infdeath is the number of deceased infants in this household. Data was
	taken from the household data set
*/
* rate = deaths / (deaths + summed exposure of the surveyed children)
	by hhid: gen infmortrate = total_infdeath / (total_infdeath + hhprop_infant)
	* people who have infants but no death in ref period, have a death rate of 0
	replace infmortrate = 0 if total_infdeath == . & !inlist(hhprop_infant, 0, .)
	
	
	by hhid: gen u5mortrate = total_u5death / (total_u5death + hhprop_u5)
	* people who have u5 children but no death in ref period, have a death rate of 0
	replace u5mortrate = 0 if total_u5death == . & !inlist(hhprop_u5, 0,.)
	

* drop variables that will not be needed
	drop total_infdeath infant refdays_infant prop_infant hhprop_infant age3 ///
		refdays bday bmonth byear bdate u5 refdays_u5 prop_u5 hhprop_u5 ///
		 sdate sdate2 refstart total_u5death

save "$temp\DLHS_temp", replace
}
}

{ /**** CALCULATING SAMPLING WEIGHTS THAT CAN BE USED IN NATIONAL ANALYSIS ***/
/*
Three steps:
	1. relative adult population per state according to the census file
	2. relative adult population per state in the CAB sample
	3. rescale the survey weight shhwt by the ratio of the two shares
States are matched through the running number tstate (position of the state
in ascending order of the state code).
*/

*** step 1: census shares ***
use "$pop\adultpopulationbystate", clear

* restrict to the states covered by the DLHS
	keep if inlist(state, 2,3,4,6,7,11,12,13,14,15,16,17,19,25,27,28,29,30,32,33,34,35)

* running state number
	sort state
	gen tstate = _n

* adult population summed over all DLHS states
	egen totaladultpop_cens = total(adultpopstate)
	format totaladultpop_cens %15.0g

* share of each state in that total
	gen relpop_cens = adultpopstate / totaladultpop_cens

	lab var totaladultpop_cens "total number of adults living in DLHS states, Census2011"
	lab var relpop_cens "relative adult population per state, Census2011"

	keep tstate relpop_cens totaladultpop_cens

save "$final\DLHS_relativepopulation_census", replace

**********************************************************************************
*** step 2: sample shares ***
use "$temp\DLHS_temp", clear

* observations with dropage == 0 only
	keep if dropage == 0

/* Telangana split off Andhra Pradesh after the Census2011, so both are
	treated as one state */

* running state number: flag the first row of every state, then cumulate
	bysort state: gen tstate = (_n == 1)
	replace tstate = sum(tstate)

* AP is tstate = 16; Telangana = 23
	replace tstate = 16 if tstate == 23

* number of sampled adults per state, one row per state
	bysort tstate: gen stateadultpop = _N
	by tstate: keep if _n == 1

	egen totaladultpop_cab = total(stateadultpop)
	gen relpop_cab = stateadultpop / totaladultpop_cab

	keep tstate relpop_cab totaladultpop_cab

	lab var relpop_cab "relative population per state, sample (DLHS data)"

save "$final\relativepopulation_cab", replace

*** step 3: attach both sets of shares to the individual data ***
use "$temp\DLHS_temp", clear

* same running state number as in step 2
	bysort state: gen tstate = (_n == 1)
	replace tstate = sum(tstate)
* AP is tstate = 16; Telangana = 23
	replace tstate = 16 if tstate == 23

	merge m:1 tstate using "$final\DLHS_relativepopulation_census", nogenerate

	merge m:1 tstate using "$final\relativepopulation_cab", nogenerate

* census share relative to sample share, applied to the survey weight
	gen relweight = relpop_cens / relpop_cab

	gen sweight = shhwt*relweight

* there are 0.25% missing values in sweight caused by missing values in shhwt

/* 
ratio of people included in the sample to the total population; needed for a
	later adjustment for the relative sample sizes of AHS and DLHS
*/
	gen sampleratio_DLHS = totaladultpop_cab / totaladultpop_cens

	lab var relweight "weight adjusting for adults per state"
	lab var sweight "final sampling weight adjusting for state size(DLHS)/merging&distr size(AHS)"
	lab var sampleratio_DLHS "Sample/Census; needed for later calculations"

	drop relpop_cab relpop_cens tstate totaladultpop_cab totaladultpop_cens

save "$final\DLHS_mort", replace
}

